library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggfortify)
library(fastDummies)
library(mosaic)## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(modelr)##
## Attaching package: 'modelr'
## The following object is masked from 'package:mosaic':
##
## resample
## The following object is masked from 'package:ggformula':
##
## na.warn
# Read the raw house data from CSV; readr guesses the column types and
# reports them in the message echoed below.
houses <- read_csv("data/kc_house_data.csv")## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(houses)## id date price
## Length:21613 Min. :2014-05-02 00:00:00.00 Min. : 75000
## Class :character 1st Qu.:2014-07-22 00:00:00.00 1st Qu.: 321950
## Mode :character Median :2014-10-16 00:00:00.00 Median : 450000
## Mean :2014-10-29 04:38:01.96 Mean : 540088
## 3rd Qu.:2015-02-17 00:00:00.00 3rd Qu.: 645000
## Max. :2015-05-27 00:00:00.00 Max. :7700000
## bedrooms bathrooms sqft_living sqft_lot
## Min. : 0.000 Min. :0.000 Min. : 290 Min. : 520
## 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427 1st Qu.: 5040
## Median : 3.000 Median :2.250 Median : 1910 Median : 7618
## Mean : 3.371 Mean :2.115 Mean : 2080 Mean : 15107
## 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10688
## Max. :33.000 Max. :8.000 Max. :13540 Max. :1651359
## floors waterfront view condition
## Min. :1.000 Min. :0.000000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000
## Median :1.500 Median :0.000000 Median :0.0000 Median :3.000
## Mean :1.494 Mean :0.007542 Mean :0.2343 Mean :3.409
## 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000
## Max. :3.500 Max. :1.000000 Max. :4.0000 Max. :5.000
## grade sqft_above sqft_basement yr_built
## Min. : 1.000 Min. : 290 Min. : 0.0 Min. :1900
## 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951
## Median : 7.000 Median :1560 Median : 0.0 Median :1975
## Mean : 7.657 Mean :1788 Mean : 291.5 Mean :1971
## 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997
## Max. :13.000 Max. :9410 Max. :4820.0 Max. :2015
## yr_renovated zipcode lat long
## Min. : 0.0 Min. :98001 Min. :47.16 Min. :-122.5
## 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3
## Median : 0.0 Median :98065 Median :47.57 Median :-122.2
## Mean : 84.4 Mean :98078 Mean :47.56 Mean :-122.2
## 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68 3rd Qu.:-122.1
## Max. :2015.0 Max. :98199 Max. :47.78 Max. :-121.3
## sqft_living15 sqft_lot15
## Min. : 399 Min. : 651
## 1st Qu.:1490 1st Qu.: 5100
## Median :1840 Median : 7620
## Mean :1987 Mean : 12768
## 3rd Qu.:2360 3rd Qu.: 10083
## Max. :6210 Max. :871200
No missing data
# Tidy the raw data for modelling: drop the columns that aren't needed,
# recode the flag-like columns as logicals and the rating columns as factors.
houses_tidy <- houses %>%
  select(-c(date, id, sqft_living15, sqft_lot15, zipcode)) %>%
  mutate(
    # waterfront only takes the values 0 and 1, so store it as a logical
    waterfront = as.logical(waterfront),
    # any non-zero renovation year becomes TRUE; the column is renamed below
    yr_renovated = yr_renovated != 0,
    # view and condition are ordinal ratings, kept as factors with every
    # level listed explicitly
    view = factor(view, levels = 0:4),
    condition = factor(condition, levels = 1:5),
    # group grade into low (1-3), average (4-10) and high (11-13)
    grade = case_when(
      grade < 4 ~ "low",
      grade > 10 ~ "high",
      TRUE ~ "average"
    ),
    grade = factor(grade, levels = c("low", "average", "high"))
  ) %>%
  rename(renovated = yr_renovated)

# Look for predictors that are exact linear combinations of the others
alias(lm(price ~ ., data = houses_tidy))
## Model :
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
## waterfront + view + condition + grade + sqft_above + sqft_basement +
## yr_built + renovated + lat + long
##
## Complete :
## (Intercept) bedrooms bathrooms sqft_living sqft_lot floors
## sqft_basement 0 0 0 1 0 0
## waterfrontTRUE view1 view2 view3 view4 condition2 condition3
## sqft_basement 0 0 0 0 0 0 0
## condition4 condition5 gradeaverage gradehigh sqft_above yr_built
## sqft_basement 0 0 0 0 -1 0
## renovatedTRUE lat long
## sqft_basement 0 0 0
alias has detected that sqft_basement can be calculated as sqft_living minus sqft_above. This means we can remove sqft_basement.
# sqft_basement is an exact linear combination of other columns, so drop it
houses_tidy <- houses_tidy %>%
  select(-sqft_basement)

# Split the columns by type so that each ggpairs() grid stays readable.
houses_tidy_numeric <- houses_tidy %>%
  select(where(is.numeric))

# price is appended to the non-numeric set so it can be plotted against
# each categorical predictor.
houses_tidy_nonnumeric <- houses_tidy %>%
  select(!where(is.numeric), price)

ggpairs(houses_tidy_numeric, progress = FALSE)
Correlation with price - Numeric Variables
Strong positive correlations 1. sqft_living (0.702) 2. sqft_above (0.606)
Moderate correlation 3. bathrooms (0.525)
Weak correlations 4. bedrooms (0.308) 5. latitude (0.307) 6. floors (0.257)
Very weak correlations 7. sqft_lot (0.09) 8. yr_built (0.054) 9. longitude (0.022)
ggpairs(houses_tidy_nonnumeric, progress = FALSE)Correlation with price - Non-Numeric Variables
waterfront - appears to affect price
View - there is a correlation with price, the median of the 5 levels generally increases with better views (less obvious between levels 1 and 2). The highest and lowest levels of view clearly have an effect on price.
condition - no obvious correlation here
grade - looks like a good correlation here with a decent increase in price related to the highest grade of building. This looks like the strongest correlation from the non-numeric variables.
renovated - may be a correlation but not a strong one from looking at boxplots
Conclusion: predictors to investigate, in order of potential: grade = “high”, waterfront = TRUE, view = 4 (or view = 0, I’m not sure).
# Model 1: price explained by living area alone
model1 <- lm(
  price ~ sqft_living,
  data = houses_tidy
)
summary(model1)
##
## Call:
## lm(formula = price ~ sqft_living, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1476062 -147486 -24043 106182 4362067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43580.743 4402.690 -9.899 <2e-16 ***
## sqft_living 280.624 1.936 144.920 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 261500 on 21611 degrees of freedom
## Multiple R-squared: 0.4929, Adjusted R-squared: 0.4928
## F-statistic: 2.1e+04 on 1 and 21611 DF, p-value: < 2.2e-16
autoplot(model1)
plot(model1)
I’m not sure these diagnostic plots look great. I have plotted the data below, and I think the issue is caused by the shape of the data: it appears to have very defined cut-offs, but I don’t know why.
houses %>%
ggplot(aes(price, sqft_living)) +
geom_point(alpha = 0.1)
Conclusion: I have reservations about the shape of the data, and consequently about the diagnostic plots, but let’s go with this for now. R2 = 0.493, rse = 261500 (this is really quite high!), p-value < 0.01.
The next strongest correlation among the numeric variables was sqft_above, but this is really very similar to sqft_living, so let’s not add it next. Bathrooms has a moderate correlation with price, so I should have tried that next, but in my exhausted delirium I added bedrooms by accident. I realised this when I came back to check things over, but when I changed bedrooms to bathrooms I found it didn’t improve the model at all, so maybe bedrooms is the better choice as 2nd predictor.
# Model 2: add bedrooms to living area
model2 <- lm(
  price ~ sqft_living + bedrooms,
  data = houses_tidy
)
summary(model2)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1650867 -143866 -23143 102344 4179850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79469.359 6604.764 12.03 <2e-16 ***
## sqft_living 313.949 2.337 134.31 <2e-16 ***
## bedrooms -57066.759 2308.223 -24.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 257800 on 21610 degrees of freedom
## Multiple R-squared: 0.5068, Adjusted R-squared: 0.5068
## F-statistic: 1.11e+04 on 2 and 21610 DF, p-value: < 2.2e-16
R2 = 0.507, so not much has been added over model 1 (R2 = 0.493); rse = 257800, still very high.
autoplot(model2)
plot(model2)
We seem to be fitting negative values - what is going on here?
houses %>%
ggplot(aes(price, bedrooms)) +
geom_point()So there is a house with over 30 bedrooms. This was investigated. This house is noted as having 33 bedrooms but costing $640000. All the other houses that cost this much (there were 21) had between 2 and 6 bedrooms, this is therefore assumed to be a typo and the number of bedrooms changed to 3.
# Recode the implausible bedroom count (more than 30) to 3, treating it as a
# data-entry error
houses_tidy <- houses_tidy %>%
  mutate(bedrooms = replace(bedrooms, bedrooms > 30, 3))
Run model and diagnostic again…
# Model 2b: same formula as model 2, refitted after the bedrooms recode
model2b <- lm(
  price ~ sqft_living + bedrooms,
  data = houses_tidy
)
summary(model2b)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1666625 -143358 -23058 102392 4163512
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90059.793 6733.139 13.38 <2e-16 ***
## sqft_living 316.914 2.365 134.00 <2e-16 ***
## bedrooms -62063.528 2392.305 -25.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 257500 on 21610 degrees of freedom
## Multiple R-squared: 0.5082, Adjusted R-squared: 0.5081
## F-statistic: 1.116e+04 on 2 and 21610 DF, p-value: < 2.2e-16
The R2 and rse are similar to model2 but the diagnostics should look better.
autoplot(model2b)
Hmmm, for plot 2 the residuals are getting larger as the prices get higher. For scale-location, the blue line is not flat and I think it should be. I am starting to have my doubts about using sqft_living; there is something not right with this data.
Does sqft_above look any better?
houses_tidy %>%
ggplot(aes(price, sqft_above)) +
geom_point()No the sqft_above data looks the same as sqft_living
What if we just try bedrooms on its own as the first predictor?
# Model 1b: bedrooms as the only predictor, for comparison with model 1
model1b <- lm(
  price ~ bedrooms,
  data = houses_tidy
)
summary(model1b)
##
## Call:
## lm(formula = price ~ bedrooms, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -993323 -203016 -65422 105984 6824400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110333 9108 12.11 <2e-16 ***
## bedrooms 127544 2610 48.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 348400 on 21611 degrees of freedom
## Multiple R-squared: 0.0995, Adjusted R-squared: 0.09946
## F-statistic: 2388 on 1 and 21611 DF, p-value: < 2.2e-16
R2 is low, bedrooms isn’t explaining much of the variation. I think we have to go with sqft_living.
Lets press on with model2b (price ~ sqft_living + bedrooms(with 33 altered))
# Model 3: model 2b plus the waterfront flag
model3 <- lm(
  price ~ sqft_living + bedrooms + waterfront,
  data = houses_tidy
)
summary(model3)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms + waterfront, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1547815 -139924 -20327 103317 4271814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83206.467 6497.811 12.80 <2e-16 ***
## sqft_living 304.569 2.302 132.29 <2e-16 ***
## bedrooms -54179.832 2316.240 -23.39 <2e-16 ***
## waterfrontTRUE 790888.877 19706.699 40.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248400 on 21609 degrees of freedom
## Multiple R-squared: 0.5423, Adjusted R-squared: 0.5422
## F-statistic: 8534 on 3 and 21609 DF, p-value: < 2.2e-16
We’re up to R2 = 0.542 now, rse reducing slightly but still very high. p-values very low and diagnostic plots ok
autoplot(model3)We’re up to R2 = 0.542 now, rse reducing slightly but still very high. p-values very low and diagnostic plots ok (ish) (I think??) Something odd has happened to leverage graph
Lets look at the residuals to see what can best explain them.
# Attach the model 3 residuals (as column `resid`) to the tidy data
houses_resid <- houses_tidy %>%
  add_residuals(model3)

# Split the columns by type so that each ggpairs() grid stays readable.
houses_tidy_numeric <- houses_resid %>%
  select(where(is.numeric))

# resid is appended to the non-numeric set so it can be plotted against
# each categorical predictor.
houses_tidy_nonnumeric <- houses_resid %>%
  select(!where(is.numeric), resid)

ggpairs(houses_tidy_numeric, progress = FALSE)
Correlation with Residuals - Numeric Columns
latitude - weak positive (0.396), close to being moderate though; yr_built - weak negative (-0.240)
ggpairs(houses_tidy_nonnumeric, progress = FALSE)Correlation with Residuals - Non-Numeric Columns
grade = “high” is showing a bit of correlation None of the others are very convincing
Add grade as 4th predictor
# Model 4: model 3 plus the grouped grade factor
model4 <- lm(
  price ~ sqft_living + bedrooms + waterfront + grade,
  data = houses_tidy
)
summary(model4)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms + waterfront + grade,
## data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1583426 -136185 -22248 99412 4189856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82809.000 120395.386 0.688 0.492
## sqft_living 264.470 2.477 106.770 < 2e-16 ***
## bedrooms -40777.877 2274.891 -17.925 < 2e-16 ***
## waterfrontTRUE 759150.255 19121.073 39.702 < 2e-16 ***
## gradeaverage 28275.650 120492.322 0.235 0.814
## gradehigh 485138.645 121159.406 4.004 6.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 240800 on 21607 degrees of freedom
## Multiple R-squared: 0.57, Adjusted R-squared: 0.5699
## F-statistic: 5728 on 5 and 21607 DF, p-value: < 2.2e-16
R2 is now 0.57 with rse 240800 grade = “high” has a p-value < 0.01 but grade = “average” has a high p-value and is not significant.
autoplot(model4)
plot(model4)
anova(model3, model4)
Adding grade is statistically significant.
anova(model2b, model3)
Adding waterfront is statistically significant.
What if we try latitude as 3rd predictor?
# Model 3b: latitude instead of waterfront as the third predictor
model3b <- lm(
  price ~ sqft_living + bedrooms + lat,
  data = houses_tidy
)
summary(model3b)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1657228 -121478 -19654 80401 4191985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.333e+07 5.584e+05 -59.69 <2e-16 ***
## sqft_living 3.073e+02 2.196e+00 139.93 <2e-16 ***
## bedrooms -5.520e+04 2.219e+03 -24.88 <2e-16 ***
## lat 7.026e+05 1.174e+04 59.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 238500 on 21609 degrees of freedom
## Multiple R-squared: 0.5781, Adjusted R-squared: 0.578
## F-statistic: 9870 on 3 and 21609 DF, p-value: < 2.2e-16
Adding latitude as 3rd predictor takes us up to R2 = 0.5781
Lets add previous 3rd predictor in which was waterfront
# Model 4b: model 3b plus the waterfront flag
model4b <- lm(
  price ~ sqft_living + bedrooms + lat + waterfront,
  data = houses_tidy
)
summary(model4b)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat + waterfront,
## data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1534034 -116206 -16352 81061 4304633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.392e+07 5.338e+05 -63.55 <2e-16 ***
## sqft_living 2.944e+02 2.118e+00 138.96 <2e-16 ***
## bedrooms -4.691e+04 2.128e+03 -22.04 <2e-16 ***
## lat 7.150e+05 1.122e+04 63.71 <2e-16 ***
## waterfrontTRUE 8.190e+05 1.809e+04 45.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 227900 on 21608 degrees of freedom
## Multiple R-squared: 0.6147, Adjusted R-squared: 0.6146
## F-statistic: 8617 on 4 and 21608 DF, p-value: < 2.2e-16
Cool, now we’re at R2 0.6147 rse still horribly high p-values all very low
autoplot(model4b)Lets look at the residuals again to see what can best explain them.
# Attach the model 4b residuals (as column `resid`) to the tidy data
houses_resid <- houses_tidy %>%
  add_residuals(model4b)

# Split the columns by type so that each ggpairs() grid stays readable.
houses_tidy_numeric <- houses_resid %>%
  select(where(is.numeric))

# resid is appended to the non-numeric set so it can be plotted against
# each categorical predictor.
houses_tidy_nonnumeric <- houses_resid %>%
  select(!where(is.numeric), resid)

ggpairs(houses_tidy_numeric, progress = FALSE)
yr_built (-0.188) and longitude (-0.153) both have very weak negative correlations with the residuals
ggpairs(houses_tidy_nonnumeric, progress = FALSE)Could try view? Grade doesn’t seem to make sense anymore as the lowest grade has a higher median than average and isn’t much different from the high level?
What happens if we add all our remaining potential predictors?? Woohoo, am going nuts!
# Model 5: model 4b plus longitude, view and grade
model5 <- lm(
  price ~ sqft_living + bedrooms + lat + waterfront + long + view + grade,
  data = houses_tidy
)
summary(model5)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat + waterfront +
## long + view + grade, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1762508 -104948 -11871 76237 4116519
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.696e+07 1.350e+06 -42.199 <2e-16 ***
## sqft_living 2.496e+02 2.324e+00 107.401 <2e-16 ***
## bedrooms -3.027e+04 2.022e+03 -14.971 <2e-16 ***
## lat 6.761e+05 1.062e+04 63.667 <2e-16 ***
## waterfrontTRUE 4.949e+05 2.079e+04 23.803 <2e-16 ***
## long -2.051e+05 1.090e+04 -18.814 <2e-16 ***
## view1 1.289e+05 1.187e+04 10.856 <2e-16 ***
## view2 9.913e+04 7.151e+03 13.863 <2e-16 ***
## view3 1.724e+05 9.787e+03 17.618 <2e-16 ***
## view4 3.335e+05 1.513e+04 22.037 <2e-16 ***
## gradeaverage -1.755e+05 1.066e+05 -1.646 0.0997 .
## gradehigh 2.463e+05 1.072e+05 2.298 0.0216 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 212800 on 21601 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.664
## F-statistic: 3883 on 11 and 21601 DF, p-value: < 2.2e-16
Still only at R2 = 0.664 rse = 212800 so still very big
The grades aren’t statistically significant which isn’t that surprising. Lets change it to condition.
# Model 5b: as model 5, with condition in place of grade
model5b <- lm(
  price ~ sqft_living + bedrooms + lat + waterfront + long + view + condition,
  data = houses_tidy
)
summary(model5b)
##
## Call:
## lm(formula = price ~ sqft_living + bedrooms + lat + waterfront +
## long + view + condition, data = houses_tidy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1752924 -108306 -10720 82648 4144185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.723e+07 1.391e+06 -41.150 < 2e-16 ***
## sqft_living 2.897e+02 2.196e+00 131.910 < 2e-16 ***
## bedrooms -4.519e+04 2.058e+03 -21.953 < 2e-16 ***
## lat 6.847e+05 1.093e+04 62.629 < 2e-16 ***
## waterfrontTRUE 5.060e+05 2.137e+04 23.680 < 2e-16 ***
## long -2.021e+05 1.124e+04 -17.980 < 2e-16 ***
## view1 1.236e+05 1.221e+04 10.127 < 2e-16 ***
## view2 9.910e+04 7.353e+03 13.477 < 2e-16 ***
## view3 1.742e+05 1.006e+04 17.311 < 2e-16 ***
## view4 3.451e+05 1.556e+04 22.184 < 2e-16 ***
## condition2 3.018e+04 4.329e+04 0.697 0.48569
## condition3 2.548e+04 4.002e+04 0.637 0.52442
## condition4 6.352e+04 4.008e+04 1.585 0.11302
## condition5 1.089e+05 4.033e+04 2.700 0.00693 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 218700 on 21599 degrees of freedom
## Multiple R-squared: 0.6453, Adjusted R-squared: 0.6451
## F-statistic: 3022 on 13 and 21599 DF, p-value: < 2.2e-16
Only condition = 5 is statistically significant. I think this means I have to group them?
Might have to give up here and go to bed. Night night zzzzzzzzz